import time,requests, re #time用于延时，requests用于请求网页数据，json转换json数据格式，re正则
from lxml import etree #解析xpath网页结构
import pandas as pd #处理表格进行数据分析
import pymysql

def getPage(url):
    """Fetch *url* with browser-like headers and return the requests Response.

    Returns None when the request fails (DNS error, timeout, connection
    reset, ...), so callers MUST check the result before using ``.text``.
    """
    # Fairly complete request headers: the site serves this endpoint to its
    # own AJAX calls, so the request has to look like a real browser session
    # (cookie and user-agent included).
    headers = {
        'Accept':'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection':'keep-alive',
        'cookie':'QN1=O5cv7V3FXX6RVSo2FsZGAg==; QN205=s%3Dgoogle; QN277=s%3Dgoogle; csrfToken=AUBSZ1SJgtl2cBX2HyuS2BTEXvmciXKc; _i=RBTKS082R8KxQVZx6JB9OtPmNyxx; QN269=1EDA0650022211EA926DFA163E72396C; fid=3ac01f31-9181-4ed7-a38a-5c0d137aefcc; QN99=1500; QN300=organic; QunarGlobal=10.86.213.148_-72515c99_16e4b45ba14_623b|1573223249997; _vi=hCXRv3fD46caoElZPDCXIreqcskT0Spj7f9XqyWhjfxmL_u3Y7xhSmU49JoeKnflmT0XtDD78wrAbOYIR_IG3FwU-fkUzn927J70W0JK0Iy219BfJnqA_WGLcMw_p8UHFhEtG9kQ-SOa3-aG4lJkhxTME8JhUdpjG4Kl7_X1KFGy; QN601=28e00ebdfc5eae6e44c48a34869c6dd1; QN163=0; QN667=B; QN48=09fd9580-c998-4f2a-bd31-516ddfaeb4f3; QN100=WyLlpKfov57mma%2Fngrl85aSn6L%2BeIl0%3D; QN243=19; JSESSIONID=568D43DF4624B99A32042DCC50F984B8; QN57=15732237429980.552477010226762; Hm_lvt_15577700f8ecddb1a927813c81166ade=1573223744; QN267=91363111d04a42d3; QN58=1573223742995%7C1573223864461%7C2; Hm_lpvt_15577700f8ecddb1a927813c81166ade=1573223865; QN271=88a69a10-00a3-42e0-a9dd-dcd01d16ca04',
        'Host':'piao.qunar.com',
        'Referer':'http://piao.qunar.com/ticket/list.htm?keyword=%E7%83%AD%E9%97%'
                  'A8%E6%99%AF%E7%82%B9&region=&from=mpl_search_suggest',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
                     ' (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'X-Requested-With':'XMLHttpRequest'
    }
    try:
        # timeout keeps a dead connection from hanging the crawler forever
        return requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as e:
        # narrow except: only network/HTTP failures, not programming errors
        print(str(e))
        return None
 
def getList():
    """Crawl every province-level '5A' search keyword and return all rows.

    Returns one flat list of sight tuples, in the order the keywords are
    listed below (each keyword contributes the rows from getList2).
    """
    keywords = ['北京5A','天津5A','上海5A','重庆5A','内蒙古5A','广西5A','西藏5A','宁夏5A','新疆5A','香港5A','澳门5A','河北5A',
    '山西5A','辽宁5A','吉林5A','黑龙江5A','江苏5A','浙江5A','安徽5A','福建5A','江西5A','山东5A','河南5A','湖北5A','湖南5A',
    '广东5A','海南5A','四川5A','贵州5A','云南5A','陕西5A','甘肃5A','青海5A','台湾5A']
    # flatten the per-keyword result lists into a single list
    return [sight for keyword in keywords for sight in getList2(keyword)]
          
 
def getList2(place):
    """Scrape up to 3 result pages of Qunar's ticket search for *place*.

    Returns a list of tuples
    (name, level, province, sold_count, hot_score)
    matching the column order that sqldb() inserts.
    """
    url = 'http://piao.qunar.com/ticket/list.htm?keyword='+ place +'&region=&from=mpl_search_suggest&page={}'
    sightlist = []
    for page_no in range(1, 4):
        page = getPage(url.format(page_no))  # fetch one result page
        if page is None:
            # getPage already printed the failure; skip this page instead
            # of crashing on page.text (original bug: AttributeError)
            continue
        selector = etree.HTML(page.text)
        print('正在爬取第', str(page_no), '页景点信息')
        # each result card is one <div> directly under the result_list container
        for inf in selector.xpath('.//div[@class="result_list"]/div'):
            try:
                sight_name = inf.xpath('.//a[@class="name"]/@title')[0]
                print(sight_name)
                level_nodes = inf.xpath('.//span[@class="level"]/text()')
                # e.g. '5A景区' -> '5A'; 0 when the card carries no level badge
                sight_level = level_nodes[0].replace('景区', '') if level_nodes else 0
                sight_area = inf.xpath('.//span[@class="area"]/a/text()')[0]
                sight_hot = inf.xpath('.//span[@class="product_star_level"]/em/span/text()')[0].replace('热度 ', '')
                sold_nodes = inf.xpath('.//span[@class="hot_num"]/text()')
                sight_soldnum = sold_nodes[0] if sold_nodes else 0
                # the card's data-point attribute holds the coordinates as 'x,y'
                sight_la, sight_lo = inf.xpath('.//@data-point')[0].split(',')
                print(sight_la, sight_lo, sight_area)
            except (IndexError, ValueError) as e:
                # one malformed card should not abort the whole crawl
                print('skip malformed entry:', e)
                continue
            # province is the part of the area string before '·'
            sightlist.append((sight_name, sight_level,
                              sight_area.split('·')[0],
                              int(sight_soldnum), float(sight_hot)))
        time.sleep(10)  # be polite: throttle between result pages
    return sightlist
 
def sqldb(rows):
    """Bulk-insert scraped sight rows into the MySQL `tourist` table.

    *rows* is a sequence of (name, level, province, sale_count, hot)
    tuples; the insert timestamp column is filled by MySQL's now().
    """
    # keyword arguments: positional pymysql.connect() args are deprecated
    db = pymysql.connect(host='localhost', user='root',
                         password='root', database='hsd_wlrh')
    try:
        # cursor as context manager closes it even if executemany raises
        with db.cursor() as cursor:
            # parameterized executemany: one round trip, no SQL injection
            cursor.executemany(
                'insert into tourist(tourist_name,type,province,sale_count,hot,tourist_time) values(%s,%s,%s,%s,%s,now())',
                rows)
        db.commit()
    finally:
        db.close()  # always release the connection, even on failure
    
def main():
    """Entry point: crawl all regions, then persist whatever was found."""
    sights = getList()
    if sights:  # skip the DB round trip entirely when the crawl found nothing
        sqldb(sights)
 
if __name__=='__main__': # script entry point: execution starts in main()
    main()
